In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
print(tf.__version__)
In [3]:
imdb = keras.datasets.imdb
(train_data, train_label),(test_data,test_label) = imdb.load_data(num_words=10000)
The argument num_words=10000 keeps the top 10,000 most frequently occurring words in the training data. The rare words are discarded to keep the size of the data manageable.
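As a quick sanity check (a minimal sketch, not part of the original notebook, assuming the cells above have already run), every word index in the encoded reviews should stay below the 10,000-word cap:

## each review is a list of word indices; with num_words=10000
## the largest index we should ever see is 9999
max_index = max(max(sequence) for sequence in train_data)
print("Largest word index in train_data:", max_index)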
In [10]:
print("Train data shape:",train_data.shape)
print("Test data shape:",test_data.shape)
print("Train label :",len(train_label))
print("First Imdb review: ",train_data[0]) ## review data for the first review
## notice the difference in length of 2 reviews
print("length of first and second review:",len(train_data[0])," ",len(test_data[1]))
In [20]:
## A dictionary mapping of a word to a integer index
word_index = imdb.get_word_index()
## The first indices are reserved, so shift every index up by 3
## before inserting the special tokens
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  ## unknown
word_index["<UNUSED>"] = 3
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
In [21]:
decode_review(train_data[0])
Out[21]:
We can pad the arrays so they all have the same length, then create an integer tensor of shape max_length * num_reviews. An embedding layer capable of handling this shape can then be used as the first layer in our network.
Since the movie reviews must all be the same length, we will use the pad_sequences function to standardize the lengths (a small illustration follows).
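To illustrate what pad_sequences does (a minimal sketch with made-up toy sequences, not part of the original notebook), padding='post' appends the pad value to the end of each sequence until it reaches maxlen, and longer sequences are truncated:

from tensorflow import keras

toy = [[11, 12, 13], [21, 22]]
padded = keras.preprocessing.sequence.pad_sequences(toy, value=0, padding='post', maxlen=4)
print(padded)
## [[11 12 13  0]
##  [21 22  0  0]]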
In [22]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)
In [23]:
print(len(train_data[0])," ",len(test_data[1]))
In [24]:
print(train_data[0])
In [27]:
# the embedding's input dimension is the vocabulary size used in the reviews, i.e. 10,000 words
vocab_size = 10000
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation = tf.nn.relu))
model.add(keras.layers.Dense(1, activation = tf.nn.sigmoid))
model.summary()
In [28]:
### adding the loss function and optimizer
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
In [32]:
### creating a validation set from the training data to monitor accuracy on unseen data during training
x_val = train_data[:10000]
partial_x_train = train_data[10000:]
y_val = train_label[:10000]
partial_y_train = train_label[10000:]
In [33]:
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)
In [34]:
results = model.evaluate(test_data, test_label)
print(results)
In [36]:
history_dict = history.history
history_dict.keys()
Out[36]:
In [37]:
import matplotlib.pyplot as plt
acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
In [38]:
plt.clf() # clear figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()